- 
                Notifications
    You must be signed in to change notification settings 
- Fork 171
X64: Use Struct for ELF Ehdr and Phdr headers #1694
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
| In this PR, I am attempting to decouple the assembly instructions and elf headers. @certik please, could you review and share if we are heading in desired direction? | 
| Also, using the approach in this PR, how shall we support generating assembly text format for the Elf headers? | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this looks good. Don't forget to remove the tmp binary.
        
          
                src/libasr/codegen/x86_assembler.cpp
              
                Outdated
          
        
      | Elf64_Phdr get_program_header(X86Assembler &a) { | ||
| Elf64_Phdr p; | ||
| p.type = 1; | ||
| p.flags = 4; | ||
| p.offset = sizeof(Elf64_Phdr); | ||
| p.vaddr = a.origin(); | ||
| p.paddr = a.origin(); | ||
| p.filesz = sizeof(Elf64_Phdr); | ||
| p.memsz = sizeof(Elf64_Phdr); | ||
| p.align = 0x1000; | ||
| return p; | ||
| } | ||
|  | ||
| Elf64_Phdr get_text_segment(X86Assembler &a, Elf64_Phdr &p_program) { | ||
| Elf64_Phdr p; | ||
| p.type = 1; | ||
| p.flags = 5; | ||
| p.offset = p_program.offset + sizeof(p); | ||
| p.vaddr = a.origin() + p.offset; | ||
| p.paddr = a.origin() + p.offset; | ||
| p.filesz = sizeof(Elf64_Phdr); | ||
| p.memsz = sizeof(Elf64_Phdr); | ||
| p.align = 0x1000; | ||
| return p; | ||
| } | ||
|  | ||
| Elf64_Phdr get_data_segment(X86Assembler &a, Elf64_Phdr &p_text_seg) { | ||
| Elf64_Phdr p; | ||
| p.type = 1; | ||
| p.flags = 6; | ||
| p.offset = p_text_seg.offset + sizeof(p_text_seg); | ||
| p.vaddr = a.origin() + p.offset; | ||
| p.paddr = a.origin() + p.offset; | ||
| p.filesz = sizeof(Elf64_Phdr); | ||
| p.memsz = sizeof(Elf64_Phdr); | ||
| p.align = 0x1000; | ||
| return p; | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These functions only use a.origin(), so I would just pass origin as an integer parameter, not the whole x86Assembler.
        
          
                src/libasr/codegen/x86_assembler.cpp
              
                Outdated
          
        
      | e.type = 2; | ||
| e.machine = 0x3e; | ||
| e.version = 1; | ||
| e.entry = a.get_defined_symbol("_start").value; | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would pass a.get_defined_symbol("_start").value as an argument, not a.
        
          
                src/libasr/codegen/x86_assembler.cpp
              
                Outdated
          
        
      | out.write((const char*) m_code.p, m_code.size()); | ||
|  | ||
| out.close(); | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would keep this function outside of X86Assembler. Eventually once we add Arm64Assembler, we will figure out how to generalize this function to also work for ELF ARM binaries.
Also I would rename it to something like create_elf64_x86_binary and I would pass m_code as an argument.
I would make the function return a string.
The caller will then save the string to a file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would make the function return a string.
I currently used a Vec<uint8_t>. I would like to know if it is fine or if I shall update it.
43021ba    to
    88aa79b      
    Compare
  
    | Example: (lp) lpython$ cat integration_tests/test_complex_01.py 
from lpython import i32, i64, f32, f64, c32, c64
def test_real_imag():
    x: c64
    x = c64(2) + 3j
    a: f64
    b: f64
    eps: f64
    eps = 1e-12
    a = x.real
    b = x.imag
    assert abs(a - 2.0) <= eps
    assert abs(b - 3.0) <= eps
    print(x)
def test_complex():
    x: c64
    x = complex(4.5, 6.7)
    eps: f64
    eps = 1e-12
    assert abs(x.real - 4.5) <= eps
    assert abs(x.imag - 6.7) <= eps
    x = complex(-4, 2)
    assert abs(x.real - (-4.0)) <= eps
    assert abs(x.imag - 2.0) <= eps
    x = complex(4, 7.89)
    assert abs(x.real - 4.0) <= eps
    assert abs(x.imag - 7.89) <= eps
    x = complex(5.6, 0)
    assert abs(x.real - 5.6) <= eps
    assert abs(x.imag - 0.0) <= eps
    a: f64
    a = 534.6
    x = complex(a, -a) # (f64, f64)
    assert abs(x.real - 534.60000000000002274) <= eps
    assert abs(x.imag - (-534.60000000000002274)) <= eps
    a2: f32
    a2 = -f32(423.5430806348152437)
    a3: f32
    a3 = f32(34.5)
    x2: c32
    x2 = c32(complex(a2, a3)) # (f32, f32)
    assert f64(abs(x2.imag - f32(34.5))) <= eps
    i1: i32
    i1 = -5
    i2: i64
    i2 = -i64(6)
    x = complex(a3, a) # (f32, f64)
    x = complex(a, a3) # (f64, f32)
    x = complex(i1, i2) # (i32, i64)
    x = complex(i1, -i1) # (i32, i32)
    x = complex(-i2, -i2) # (i64, i64)
    x = complex(i2, -i1) # (i64, i32)
    print(x)
def test_complex_unary_minus():
    c: c32
    c = c32(complex(3, 4.5))
    _c: c32
    _c = -c
    assert abs(f64(_c.real) - (-3.0)) <= 1e-12
    assert abs(f64(_c.imag) - (-4.5)) <= 1e-12
    _c = c32(complex(5, -78))
    _c = -_c
    assert abs(f64(_c.real) - (-5.0)) <= 1e-12
    assert abs(f64(_c.imag) - 78.0) <= 1e-12
    c2: c64
    c2 = complex(-4.5, -7.8)
    c2 = -c2
    assert abs(c2.real - 4.5) <= 1e-12
    assert abs(c2.imag - 7.8) <= 1e-12
    c2 = c64(3) + 4j
    c2 = -c2
    assert abs(c2.real - (-3.0)) <= 1e-12
    assert abs(c2.imag - (-4.0)) <= 1e-12
    print(c, _c, c2)
def test_complex_not():
    c: c32
    c = c32(complex(4, 5))
    b: bool
    b = not c
    assert not b
    c2: c64
    c2 = complex(0, 0)
    b = not c2
    assert b
    print(c,c2, b)
def check():
    test_real_imag()
    test_complex()
    test_complex_unary_minus()
    test_complex_not()
check()
(lp) lpython$ lpython integration_tests/test_complex_01.py --backend wasm -o tmp.out
(lp) lpython$ wasmtime tmp.out
(2.000000000,3.000000000)
(-6.000000000,5.000000000)
(3.000000000,4.50000000) (-5.000000000,78.000000000) (-3.000000000,-4.000000000)
(4.000000000,5.000000000) (0.000000000,0.000000000) 1
(lp) lpython$ lpython integration_tests/test_complex_01.py --backend wasm_x64 -o tmp2.out > tmp2.asm
(lp) lpython$ ./tmp2.out 
(2.000000000,3.000000000)
(-6.000000000,5.000000000)
(3.000000000,4.50000000) (-5.000000000,78.000000000) (-3.000000000,-4.000000000)
(4.000000000,5.000000000) (0.000000000,0.000000000) 1
(wasm_asm) lpython$ nasm -fbin tmp2.asm && chmod +x tmp2
(wasm_asm) lpython$ ./tmp2
(2.000000000,3.000000000)
(-6.000000000,5.000000000)
(3.000000000,4.50000000) (-5.000000000,78.000000000) (-3.000000000,-4.000000000)
(4.000000000,5.000000000) (0.000000000,0.000000000) 1 | 
| This is ready. Please review and share feedback. | 
|  | ||
| for (auto b:a.get_machine_code()) { | ||
| bin.push_back(al, b); | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this copying the (potentially big) binary code byte by byte?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it is copying the binary code byte by byte. Another approach that we can do is to just return the ELF and Phdr headers in create_elf64_x86_binary() (we can rename it as create_elf64_x86_header()).
For example:
void X86Assembler::save_binary64(const std::string &filename) {
    Vec<uint8_t> header = create_elf64_x86_header(m_al, *this);
    {
        std::ofstream out;
        out.open(filename);
        out.write((const char*) header.p, header.size());
        out.write((const char*) m_code.p, m_code.size());
        out.close();
    }
#ifdef LFORTRAN_LINUX
    std::string mode = "0755";
    int mod = strtol(mode.c_str(), 0, 8);
    if (chmod(filename.c_str(),mod) < 0) {
        throw AssemblerError("chmod failed");
    }
#endif
}There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I currently updated it to return the header only.
| I think it looks ok, but I am a bit worried about performance. Can you benchmark this compared to master? | 
| Please find the benchmark with respect to main. Benchmark used #1222 (comment) ( generate benchmark: python examples/expr2.py > bench.pymain branch: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real    0m12.138s
user    0m12.079s
sys     0m0.060sThis PR: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real    0m12.963s
user    0m12.771s
sys     0m0.084sRelative: (real time considered here) 
 | 
| The timing seems similar now (The timing on  Main: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real    0m11.905s
user    0m11.842s
sys     0m0.061sThis PR: (lp) lpython$ time lpython --backend=wasm_x64 bench.py -o bench_wasm_x64.x
real    0m11.877s
user    0m11.802s
sys     0m0.065s | 
| This is ready. | 
| Did you benchmark LPython compiled in Release mode? | 
| 
 Sorry, it is in debug mode. Do we need release mode? From #1222 (comment), it seems in release mode, the timing difference could be lesser. Thus, I thought that in debug mode, we could have enlarged timing difference which could act as a better or more stricter value. | 
| We should only do benchmarks in Release mode, as in Debug mode there are all kinds of asserts and checks that are not executed in Release mode, so they are not relevant. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this looks good now. Thanks for this refactoring!
| 
 Shall we merge before release benchmarking or after release benchmarking? | 
Co-authored-by: Ondřej Čertík <[email protected]>
| On this   #!/usr/bin/env bash
set -e
set -x
cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DWITH_LLVM=yes \
    -DLPYTHON_BUILD_ALL=yes \
    -DWITH_STACKTRACE=no \
    -DWITH_RUNTIME_STACKTRACE=no \
    -DWITH_LSP=no \
    -DWITH_LFORTRAN_BINARY_MODFILES=no \
    -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH_LPYTHON;$CONDA_PREFIX" \
    -DCMAKE_INSTALL_PREFIX=`pwd`/inst \
    .
cmake --build . -j16 --target installI am receiving the following timing values on AMD Ryzen 5 2500U with Radeon Vega Mobile Gfx @1.600 GHz, Ubuntu 22.04.4 LTS: (lp) lpython$ time lpython bench.py --backend wasm_x86 -o tmp
real    0m3.393s
user    0m3.340s
sys     0m0.053s
(lp) lpython$ time lpython bench.py --backend x86 -o tmp
real    0m2.736s
user    0m2.683s
sys     0m0.054s
(lp) lpython$ time lpython bench.py --backend wasm_x64 -o tmp
real    0m3.457s
user    0m3.417s
sys     0m0.040s
(lp) lpython$ time lpython bench.py --backend llvm -o tmp
real    0m30.106s
user    0m29.968s
sys     0m0.129s
(lp) ubaid@ubaid-Lenovo-ideapad-330-15ARR:~/Desktop/Open-Source/lpython$ time python bench.py 
249975005
real    0m0.499s
user    0m0.374s
sys     0m0.125s
(lp) ubaid@ubaid-Lenovo-ideapad-330-15ARR:~/Desktop/Open-Source/lpython$ These values seems to differ significantly from release mode values shared at #1222 (comment). Is the build script used to build in release mode as expected or am I missing something? | 
| Measure master and this PR in Release mode and let's see. I can do the same. If the benchmarks in Release mode look good, we can merge. | 
| Compiled on ubuntu 22.04, using ./build0.sh
cmake .
cmake --build . -j16On main branch: (lp) lpython$ time lpython bench.py --backend wasm_x64 -o tmp.out
real    0m2.258s
user    0m2.218s
sys     0m0.041sOn this PR branch: (lp) lpython$ time lpython bench.py --backend wasm_x64 -o tmp.out
real    0m1.588s
user    0m1.508s
sys     0m0.080sCurrently benchmarked the  | 
| Ok. I think it's good enough. Go ahead and merge this. | 
| Thank you for the approval. | 
| I just noticed I also had another branch locally with slightly varied approach Vec<uint8_t> create_elf64_x86_binary(Allocator &al, X86Assembler &a) {
    const int E_IDX = 0;
    const int PROGRAM_IDX = E_IDX + sizeof(Elf64_Ehdr);
    const int TEXT_SEG_IDX = PROGRAM_IDX + sizeof(Elf64_Phdr);
    const int DATA_SEG_IDX = TEXT_SEG_IDX + sizeof(Elf64_Phdr);
    const int TOTAL_HEADER_SIZE = DATA_SEG_IDX + sizeof(Elf64_Phdr);
    Vec<uint8_t> binary;
    binary.resize(al, TOTAL_HEADER_SIZE);
    Elf64_Ehdr* e = (Elf64_Ehdr*)(binary.p + E_IDX);
    Elf64_Phdr* p_program = (Elf64_Phdr*)(binary.p + PROGRAM_IDX);
    Elf64_Phdr* p_text_seg = (Elf64_Phdr*)(binary.p + TEXT_SEG_IDX);
    Elf64_Phdr* p_data_seg = (Elf64_Phdr*)(binary.p + DATA_SEG_IDX);
    set_header(a, e);
    set_program_header(a, p_program);
    set_text_segment(a, p_program, p_text_seg);
    set_data_segment(a, p_text_seg, p_data_seg);
    align_by_byte(al, binary, 0x1000);
    const int PROGRAM_HEADER_SIZE = binary.size();
    e->entry = a.get_defined_symbol("_start").value + PROGRAM_HEADER_SIZE;
    p_program->filesz = PROGRAM_HEADER_SIZE;
    p_program->memsz = p_program->filesz;
    p_text_seg->offset = p_program->offset + p_program->filesz;
    p_text_seg->vaddr = a.origin() + p_text_seg->offset + PROGRAM_HEADER_SIZE;
    p_text_seg->paddr = p_text_seg->vaddr;
    p_data_seg->offset = p_text_seg->offset + p_text_seg->filesz;
    p_data_seg->vaddr = a.origin() + p_data_seg->offset + PROGRAM_HEADER_SIZE;
    p_data_seg->paddr = p_data_seg->vaddr;
    append_bytes(al, a.get_machine_code(), binary);
    return binary;
}where  void set_program_header(X86Assembler &a, Elf64_Phdr* p) {
    p->type = 1;
    p->flags = 4;
    p->offset = 0;
    p->vaddr = a.origin();
    p->paddr = a.origin();
    p->filesz = sizeof(Elf64_Ehdr) + 3 * sizeof(Elf64_Phdr);
    p->memsz = p->filesz;
    p->align = 0x1000;
} | 
| Sorry, I should have shared this approach #1694 (comment) early. I forgot about it while working on the new branch. | 
| The merged PR is fine with me. If you want to change it, go ahead and submit a PR. | 
fixes #1539.
Also, towards #1485.
PS: This is a work in progress, hence the tests might not work.